K-means Clustering Algorithm
Start with k random clusters

Calculate means

Assign each point to the cluster whose mean it is closest to

Adjust means and repeat

Potential Issues
- What happens with high dimensionality?
- What happens when dimensions aren’t scaled?
Setup
# Knit options: show code, suppress package startup messages/warnings
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

library(tidyverse)
library(tidytext)

# Standard stop-word lexicons bundled with tidytext
data(stop_words)

# Wine reviews; add a numeric row id so tokenized rows can be
# joined back to their source review later
wine <- read_rds("../resources/variety-project.rds") %>%
  rowid_to_column(var = "id")

glimpse(wine)
## Observations: 4,053
## Variables: 5
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ variety <chr> "Pinot_Gris", "Pinot_Noir", "Pinot_Noir", "Pinot_Noi…
## $ price <dbl> 14, 65, 20, 50, 22, 25, 38, 28, 45, 22, 40, 50, 52, …
## $ points <dbl> 87, 87, 87, 86, 86, 86, 91, 85, 85, 85, 89, 89, 92, …
## $ description <chr> "Tart and snappy, the flavors of lime flesh and rind…
Find top words in each variety
# Most distinctive descriptor words per variety.
# Tokenize descriptions, drop stop words and two near-universal wine
# words, then keep each variety's top-2 words by the number of
# reviews that mention them.
#
# Fixes: removed a dead step (`group_by(id) %>% mutate(exists = ...)
# %>% ungroup`) whose `exists` column was discarded by the following
# count(); made the anti_join key explicit to silence the
# "Joining, by = ..." message.
top_words <- wine %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!(word %in% c("wine","flavors"))) %>%
  count(id, word) %>%                # one row per (review, word) pair
  right_join(wine, by = "id") %>%    # keep reviews with no matching words
  count(variety, word) %>%           # reviews-per-variety mentioning word
  group_by(variety) %>%
  top_n(2, n) %>%                    # top 2 words per variety (keeps ties)
  ungroup() %>%
  select(word) %>%
  distinct()
top_words
| apple |
| fruit |
| pear |
| cherry |
| dry |
| finish |
Create features from top words
# Build the clustering feature matrix: one 0/1 presence column per
# top word, plus standardized price and points.
# NOTE(review): `n` from count() is always >= 1, so `exists` is
# always 1 here — it simply marks word presence before pivoting.
wino <- wine %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words) %>%
  filter(word %in% top_words$word) %>%
  count(id, word) %>%
  group_by(id) %>%
  mutate(exists = if_else(n>0,1,0)) %>%
  ungroup %>%
  # wide: one binary column per word; absent words filled with 0
  pivot_wider(id_cols = id, names_from = word, values_from = exists, values_fill = c(exists=0)) %>%
  # keep wines whose descriptions matched no top words at all ...
  right_join(wine, by="id") %>%
  # ... and zero-fill the word columns those rows picked up as NA.
  # Order matters: this must happen before scale(), which would
  # otherwise propagate NA into the mean/sd.
  replace(.,is.na(.),0) %>%
  # log first: price is heavily right-skewed
  mutate(price=log(price)) %>%
  # z-score so k-means distances aren't dominated by price/points
  mutate(price=scale(price), points=scale(points)) %>%
  select(-id,-variety, -description)
Look at the data
# Preview the engineered feature matrix
head(wino)
| 0 |
0 |
0 |
0 |
0 |
0 |
-1.7597764 |
-0.8716431 |
| 0 |
0 |
0 |
0 |
0 |
0 |
1.2196684 |
-0.8716431 |
| 0 |
0 |
0 |
0 |
0 |
0 |
-1.0676168 |
-0.8716431 |
| 1 |
1 |
0 |
0 |
0 |
0 |
0.7105271 |
-1.2362597 |
| 0 |
0 |
0 |
0 |
0 |
0 |
-0.8826589 |
-1.2362597 |
| 0 |
0 |
0 |
0 |
0 |
0 |
-0.6345868 |
-1.2362597 |
Basic K-means cluster
# k-means starts from random centers: without a seed the clusters
# (and their labels) change on every knit. Fix the RNG and use
# multiple restarts, keeping the best (lowest total within-cluster
# SS) solution.
set.seed(505)
kclust <- kmeans(wino, centers = 3, nstart = 25)
kclust$centers
## finish fruit cherry dry pear apple
## 1 0.2905457 0.6033820 0.4112221 0.01691007 0.03228286 0.05841660
## 2 0.2460595 0.5315236 0.1225919 0.08669002 0.20052539 0.21366025
## 3 0.2975155 0.6304348 0.2826087 0.03664596 0.04906832 0.08322981
## price points
## 1 0.97207982 1.0128162
## 2 -1.15843641 -0.9246434
## 3 0.03618542 -0.1625659
# One-row fit summary (total / within / between SS) via broom::glance
glance(kclust)
| 11609.88 |
6392.102 |
5217.775 |
3 |
Add clusters to original dataset (using broom's augment)
# broom::augment appends the fitted assignment as a `.cluster`
# factor column on each row of the feature matrix
wink <- augment(kclust,wino)
head(wink)
| 0 |
0 |
0 |
0 |
0 |
0 |
-1.7597764 |
-0.8716431 |
2 |
| 0 |
0 |
0 |
0 |
0 |
0 |
1.2196684 |
-0.8716431 |
3 |
| 0 |
0 |
0 |
0 |
0 |
0 |
-1.0676168 |
-0.8716431 |
2 |
| 1 |
1 |
0 |
0 |
0 |
0 |
0.7105271 |
-1.2362597 |
3 |
| 0 |
0 |
0 |
0 |
0 |
0 |
-0.8826589 |
-1.2362597 |
2 |
| 0 |
0 |
0 |
0 |
0 |
0 |
-0.6345868 |
-1.2362597 |
2 |
Visualize clusters
# Compare feature distributions across the k-means clusters: reshape
# a set of columns to long form and overlay per-cluster densities.
plot_cluster_density <- function(df, feature_cols) {
  df %>%
    pivot_longer(all_of(feature_cols), names_to = "feature") %>%
    ggplot(aes(x = value, fill = .cluster)) +
    geom_density(alpha = 0.3) +
    facet_wrap(~feature)
}

plot_cluster_density(wink, c("apple", "cherry", "dry"))

plot_cluster_density(wink, c("finish", "fruit", "pear"))

plot_cluster_density(wink, c("points", "price"))

Try different numbers of clusters
# Fit k-means for k = 1..9, storing each model, its one-row summary,
# and the augmented (assignment-labelled) data as list-columns.
# Seed first: each of the nine kmeans() calls draws random starting
# centers, so results are otherwise irreproducible.
set.seed(505)
kclusts <- tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~kmeans(wino, centers = .x)),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, wino)
  )
Plot the different clusters on two axes
# Scatter the two continuous features, colored by assigned cluster,
# one panel per choice of k
assignments <- kclusts %>%
  unnest(augmented)

assignments %>%
  ggplot(aes(x = price, y = points)) +
  geom_point(aes(color = .cluster), alpha = 0.3) +
  facet_wrap(~ k)

Look at improvement in within-cluster error
# Elbow plot: total within-cluster sum of squares versus k.
# Fix: the `.drop` argument was deprecated and removed from
# tidyr::unnest() (tidyr >= 1.0); selecting the needed columns
# before unnesting achieves the same result.
clusterings <- kclusts %>%
  select(k, glanced) %>%
  unnest(glanced)
ggplot(clusterings, aes(k, tot.withinss)) +
  geom_line()

- What are some real world applications of clustering?
- How might I choose number of clusters in practice?
Hierarchical Clustering




Create the hierarchical cluster
# hclust() needs the full pairwise distance matrix (O(n^2)), so work
# on a subsample of 200 wines.
# Fixes: seed the sample so the dendrogram is reproducible across
# knits; slice_sample() supersedes sample_n().
set.seed(505)
swine <- wino %>%
  slice_sample(n = 200)

# Agglomerative clustering (complete linkage by default) on
# Euclidean distances
hclustr <- hclust(d = dist(swine))
summary(hclustr)
## Length Class Mode
## merge 398 -none- numeric
## height 199 -none- numeric
## order 200 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 2 -none- call
## dist.method 1 -none- character
Plot the dendrogram
# Dendrogram of the sampled wines; the red line marks a candidate
# cut height
plot(hclustr)
abline(h=3, col="red")

Assign clusters
# Cluster the full feature matrix, cut the tree into 5 groups, and
# attach the group label to each row as a factor column
hclustr <- hclust(d = dist(wino))
cluster <- cutree(hclustr, k = 5)
swine <- wino %>%
  add_column(cluster) %>%
  mutate(cluster = as_factor(cluster))
head(swine)
| 0 |
0 |
0 |
0 |
0 |
0 |
-1.7597764 |
-0.8716431 |
1 |
| 0 |
0 |
0 |
0 |
0 |
0 |
1.2196684 |
-0.8716431 |
2 |
| 0 |
0 |
0 |
0 |
0 |
0 |
-1.0676168 |
-0.8716431 |
1 |
| 1 |
1 |
0 |
0 |
0 |
0 |
0.7105271 |
-1.2362597 |
2 |
| 0 |
0 |
0 |
0 |
0 |
0 |
-0.8826589 |
-1.2362597 |
1 |
| 0 |
0 |
0 |
0 |
0 |
0 |
-0.6345868 |
-1.2362597 |
1 |
Visualize clusters
# Same per-feature density comparison as before, now colored by the
# hierarchical cluster labels
plot_hier_density <- function(df, feature_cols) {
  df %>%
    pivot_longer(all_of(feature_cols), names_to = "feature") %>%
    ggplot(aes(x = value, fill = cluster)) +
    geom_density(alpha = 0.3) +
    facet_wrap(~feature)
}

plot_hier_density(swine, c("apple", "cherry", "dry"))

plot_hier_density(swine, c("finish", "fruit", "pear"))

plot_hier_density(swine, c("points", "price"))

What do you see as some of the issues with Hierarchical clustering?